/**
 * author: Stefan Buettcher
 * created: 2006-09-27
 * changed: 2006-09-27
 **/


#include <map>
#include <set>
#include <string>
#include "relevance_model.h"
#include "../feedback/interpolation_language_model.h"
#include "../misc/all.h"

using namespace std;


#define LOG_ID "RelevanceModel"


/**
 * Builds a relevance model from a set of feedback documents.
 *
 * For every document i, a temporary LanguageModel is created from the index
 * range given by docStarts[i] and docEnds[i]. These per-document models are
 * then combined by the build method selected through "method":
 *
 *   METHOD_CONCAT      buildModelConcat (ignores docScores and the query)
 *   METHOD_WEIGHTED    buildModelWeighted (uses docScores as weights)
 *   METHOD_LAVRENKO_1  buildModelLavrenko1 (uses queryTerms/termCount)
 *   METHOD_LAVRENKO_2  buildModelLavrenko2 (uses queryTerms/termCount)
 *
 * At most MAX_DOC_COUNT documents are processed; additional documents are
 * silently ignored. Note that the base class is initialized with the
 * original (unclamped) docCount. Any other value of "method" is logged as an
 * error and triggers an assertion failure.
 **/
RelevanceModel::RelevanceModel(Index *index, offset *docStarts, offset *docEnds,
		double *docScores, int docCount, char **queryTerms, int termCount, int method)
		: LanguageModel(1.0, docCount, true) {
	assert(docCount > 0);

	// the temporary arrays used during construction hold MAX_DOC_COUNT entries
	if (docCount > MAX_DOC_COUNT) {
		docCount = MAX_DOC_COUNT;
	}

	this->index = index;
	corpusSize = 0;

	// build one temporary language model per feedback document
	LanguageModel *perDocumentModels[MAX_DOC_COUNT];
	for (int doc = 0; doc < docCount; doc++) {
		perDocumentModels[doc] =
			new LanguageModel(index, docStarts[doc], docEnds[doc], true);
	}

	// combine the per-document models according to the requested method
	switch (method) {
		case METHOD_CONCAT:
			buildModelConcat(perDocumentModels, docCount);
			break;
		case METHOD_WEIGHTED:
			buildModelWeighted(perDocumentModels, docScores, docCount);
			break;
		case METHOD_LAVRENKO_1:
			buildModelLavrenko1(
					perDocumentModels, docScores, docCount, queryTerms, termCount);
			break;
		case METHOD_LAVRENKO_2:
			buildModelLavrenko2(
					perDocumentModels, docScores, docCount, queryTerms, termCount);
			break;
		default:
			log(LOG_ERROR, LOG_ID, "Illegal method in RelevanceModel().");
			assert(false);
	}

	// the per-document models are only needed while building; release them
	for (int doc = 0; doc < docCount; doc++) {
		delete perDocumentModels[doc];
	}
} // end of RelevanceModel(...)


/**
 * Destructor. The body is intentionally empty: the temporary per-document
 * models allocated by the constructor are already deleted before the
 * constructor returns, so nothing is released here.
 **/
RelevanceModel::~RelevanceModel() {
} // end of ~RelevanceModel()


/**
 * Collects the distinct term strings found in the given document models.
 *
 * "terms" is cleared first and afterwards contains every string returned by
 * getTermString(k) for k in [0, getTermCount()) of each of the first
 * "docCount" models in "docModels".
 **/
static void getTermSet(LanguageModel **docModels, int docCount, set<string> *terms) {
	terms->clear();
	for (int doc = 0; doc < docCount; doc++) {
		LanguageModel *model = docModels[doc];
		int termsInModel = model->getTermCount();
		for (int t = 0; t < termsInModel; t++) {
			// copy the term into the set, then release the buffer handed back
			// by getTermString with free()
			char *termString = model->getTermString(t);
			terms->insert(string(termString));
			free(termString);
		}
	}
} // end of getTermSet(LanguageModel**, int, set<string>*)


/**
 * Builds the model as if all documents had been concatenated into one.
 *
 * The corpus size becomes the sum of the documents' corpus sizes. For every
 * term occurring in at least one document model, the term frequency is the
 * sum of the per-document term frequencies and the document count is the
 * number of documents in which the term has a positive frequency.
 **/
void RelevanceModel::buildModelConcat(LanguageModel **docModels, int docCount) {
	set<string> vocabulary;
	getTermSet(docModels, docCount, &vocabulary);

	// total size = sum of the individual document sizes
	corpusSize = 0;
	for (int doc = 0; doc < docCount; doc++) {
		corpusSize += docModels[doc]->getCorpusSize();
	}

	set<string>::iterator it = vocabulary.begin();
	while (it != vocabulary.end()) {
		// the language model API takes non-const char*; the string is only
		// passed on, the cast merely satisfies the signature
		char *term = const_cast<char*>(it->c_str());

		offset totalFrequency = 0;
		offset containingDocs = 0;
		for (int doc = 0; doc < docCount; doc++) {
			offset tf = 0, df = 0;
			docModels[doc]->getTermInfo(term, &tf, &df);
			totalFrequency += tf;
			// the per-document df value is ignored; a document counts as soon
			// as the term occurs in it at least once
			if (tf > 0) {
				containingDocs++;
			}
		}
		addTerm(term, totalFrequency, containingDocs);
		++it;
	}
} // end of buildModelConcat(LanguageModel**, int)


/**
 * Builds the model as a weighted combination of the document models.
 *
 * Every weight must be positive (asserted). If the smallest weight is below
 * 1, all weights are divided by it so that the smallest weight becomes 1.
 * NOTE: this rescaling is done in place, i.e. the caller's "weights" array is
 * modified.
 *
 * Document i contributes (its corpus size * weights[i]) to the corpus size of
 * this model, and (its corpus size * weights[i] * P(term | document i)) to
 * the frequency of each term whose probability in that document exceeds 1E-9.
 * Term frequencies are rounded with LROUND before being added. (The
 * commented-out code this replaces used a fixed factor of 10000 instead of
 * the document's corpus size.)
 **/
void RelevanceModel::buildModelWeighted(LanguageModel **docModels, double *weights, int docCount) {
	set<string> vocabulary;
	getTermSet(docModels, docCount, &vocabulary);

	// find the smallest weight
	double smallestWeight = 1E9;
	for (int doc = 0; doc < docCount; doc++) {
		assert(weights[doc] > 0);
		if (weights[doc] < smallestWeight) {
			smallestWeight = weights[doc];
		}
	}

	// rescale (in place) so that no weight is smaller than 1
	if (smallestWeight < 1) {
		for (int doc = 0; doc < docCount; doc++) {
			weights[doc] /= smallestWeight;
		}
	}

	// weighted total size of all documents
	corpusSize = 0;
	for (int doc = 0; doc < docCount; doc++) {
		corpusSize += docModels[doc]->corpusSize * weights[doc];
	}

	set<string>::iterator it = vocabulary.begin();
	while (it != vocabulary.end()) {
		char *term = const_cast<char*>(it->c_str());
		double weightedFrequency = 0;
		offset containingDocs = 0;
		for (int doc = 0; doc < docCount; doc++) {
			double termProb = docModels[doc]->getTermProbability(term);
			// probabilities at or below 1E-9 are treated as "term not present"
			if (termProb > 1E-9) {
				weightedFrequency += docModels[doc]->corpusSize * weights[doc] * termProb;
				containingDocs++;
			}
		}
		addTerm(term, LROUND(weightedFrequency), containingDocs);
		++it;
	}
} // end of buildModelWeighted(LanguageModel**, double*, int)


/**
 * Builds the model following Lavrenko's estimation method 1.
 *
 * Each document model D_k is first smoothed by interpolating it with the
 * index's static (background) language model, using weights 0.8 (document)
 * and 0.2 (background). For every smoothed document model, the likelihood of
 * the query is computed as
 *
 *   L_k = prod_i P(q_i | D_k)
 *
 * and every term w that appears in at least one document receives the
 * (unnormalized) probability
 *
 *   p(w) = sum_k 1/docCount * P(w | D_k) * L_k.
 *
 * All probabilities are bounded from below by 1 / (background corpus size).
 * The values are finally normalized to sum to 1 and converted to term
 * frequencies relative to a corpus size of 1E9.
 *
 * The "weights" parameter is not used by this method. docCount must not
 * exceed MAX_DOC_COUNT (the constructor clamps it accordingly).
 **/
void RelevanceModel::buildModelLavrenko1(
		LanguageModel **docModels, double *weights, int docCount, char **queryTerms, int termCount) {
	assert(docCount <= MAX_DOC_COUNT);

	set<string> terms;
	getTermSet(docModels, docCount, &terms);
	set<string>::iterator iter;

	LanguageModel *backgroundModel = index->getStaticLanguageModel();
	assert(backgroundModel != NULL);

	// lower bound applied to every probability, so that a single unseen term
	// cannot drive a whole product to zero
	const double minProb = 1.0 / backgroundModel->corpusSize;

	InterpolationLanguageModel *interpolatedDocModels[MAX_DOC_COUNT];
	double interpolationWeights[2] = {0.8, 0.2};
	for (int i = 0; i < docCount; i++) {
		AbstractLanguageModel *models[2];
		models[0] = docModels[i];
		models[1] = backgroundModel;
		interpolatedDocModels[i] =
			new InterpolationLanguageModel(models, interpolationWeights, 2);
	}

	// compute how likely the query is to be generated by each of the document
	// models; Lavrenko uses this to weight document models when computing the
	// final relevance model. The array is indexed by DOCUMENT: entry k holds
	// the product, over all query terms, of P(q_i | D_k). (The previous code
	// had the two loops swapped: it stored one value per query term, leaving
	// entries uninitialized whenever docCount > termCount and writing past the
	// end of the array whenever termCount > MAX_DOC_COUNT.)
	double queryLikelihoodInModel[MAX_DOC_COUNT];
	for (int k = 0; k < docCount; k++) {
		double likelihood = 1;
		for (int i = 0; i < termCount; i++) {
			double q = interpolatedDocModels[k]->getTermProbability(queryTerms[i]);
			if (q < minProb)
				q = minProb;
			likelihood *= q;
		}
		queryLikelihoodInModel[k] = likelihood;
	}

	// compute term probabilities for all terms appearing in at least one of the
	// input documents, according to Lavrenko's method 1
	map<string,double> termProbs;
	double probSum = 0;
	corpusSize = 1E9;
	for (iter = terms.begin(); iter != terms.end(); ++iter) {
		char *term = (char*)iter->c_str();
		// skip terms that this model already knows about
		if (getTermID(term) >= 0)
			continue;
		double p = 0;
		for (int k = 0; k < docCount; k++) {
			double termProb = interpolatedDocModels[k]->getTermProbability(term);
			if (termProb < minProb)
				termProb = minProb;
			p += 1.0 / docCount * termProb * queryLikelihoodInModel[k];
		}
		// NOTE(review): addTerm is called here with the unnormalized value and
		// again below with the normalized one. If addTerm accumulates rather
		// than replaces, the stored frequency is the sum of both and the
		// document count ends up as 2 -- confirm that this is intended.
		addTerm(term, LROUND(corpusSize * p), 1);
		termProbs[term] = p;
		probSum += p;
	}

	// normalize probability values so that we get a sum of 1
	map<string,double>::iterator iter2;
	for (iter2 = termProbs.begin(); iter2 != termProbs.end(); ++iter2) {
		char *term = (char*)iter2->first.c_str();
		double p = iter2->second / probSum;
		addTerm(term, LROUND(corpusSize * p), 1);
	}

	// free temporary resources
	for (int i = 0; i < docCount; i++)
		delete interpolatedDocModels[i];
} // end of buildModelLavrenko1(...)


/**
 * Builds the model following Lavrenko's estimation method 2.
 *
 * Each document model is smoothed by interpolating it with the index's
 * static (background) language model, using weights 0.6 (document) and 0.4
 * (background). For every term w that appears in at least one document and
 * is not yet part of this model, the code computes
 *
 *   P(w)  = sum_k P(w | D_k) / docCount
 *   p(w)  = P(w) * prod_i [ sum_k P(q_i | D_k) * P(w) * P(w | D_k) / docCount ]
 *
 * where P(q_i | D_k) and the P(w | D_k) inside the sum are bounded from below
 * by 1 / (background corpus size). The values are then normalized to sum to
 * 1 and converted to term frequencies relative to a corpus size of 1E9.
 *
 * The "weights" parameter is not used by this method.
 *
 * NOTE(review): in the published formulation, the inner sum weights each
 * document by P(D_k | w) = P(w | D_k) * P(D_k) / P(w), i.e. it DIVIDES by
 * P(w), whereas this code multiplies by it. Left unchanged here -- confirm
 * whether this deviation is intended.
 **/
void RelevanceModel::buildModelLavrenko2(
		LanguageModel **docModels, double *weights, int docCount, char **queryTerms, int termCount) {
	set<string> vocabulary;
	getTermSet(docModels, docCount, &vocabulary);

	LanguageModel *backgroundModel = index->getStaticLanguageModel();
	assert(backgroundModel != NULL);

	// smallest probability that is ever used for a single term
	const double floorProb = 1.0 / backgroundModel->corpusSize;

	// smooth every document model with the background model
	double mixture[2] = {0.6, 0.4};
	InterpolationLanguageModel *smoothedModels[MAX_DOC_COUNT];
	for (int doc = 0; doc < docCount; doc++) {
		AbstractLanguageModel *components[2];
		components[0] = docModels[doc];
		components[1] = backgroundModel;
		smoothedModels[doc] = new InterpolationLanguageModel(components, mixture, 2);
	}

	// first pass: unnormalized probability for every candidate term
	map<string,double> rawProbs;
	double rawProbTotal = 0;
	corpusSize = 1E9;
	set<string>::iterator termIter = vocabulary.begin();
	for (; termIter != vocabulary.end(); ++termIter) {
		char *term = const_cast<char*>(termIter->c_str());
		// skip terms that this model already knows about
		if (getTermID(term) >= 0) {
			continue;
		}

		// average probability of the term over all smoothed document models
		double globalTermProb = 0;
		for (int doc = 0; doc < docCount; doc++) {
			globalTermProb += smoothedModels[doc]->getTermProbability(term) / docCount;
		}

		// multiply in one factor per query term
		double jointProb = globalTermProb;
		for (int qt = 0; qt < termCount; qt++) {
			double sumOverDocs = 0;
			for (int doc = 0; doc < docCount; doc++) {
				double localTermProb = smoothedModels[doc]->getTermProbability(term);
				if (localTermProb < floorProb) {
					localTermProb = floorProb;
				}
				double queryProb = smoothedModels[doc]->getTermProbability(queryTerms[qt]);
				if (queryProb < floorProb) {
					queryProb = floorProb;
				}
				sumOverDocs += queryProb * globalTermProb * localTermProb / docCount;
			}
			jointProb *= sumOverDocs;
		}

		// NOTE(review): addTerm is called here with the unnormalized value and
		// again in the second pass with the normalized one; if addTerm
		// accumulates rather than replaces, both contributions are kept --
		// confirm that this is intended.
		addTerm(term, LROUND(corpusSize * jointProb), 1);
		rawProbs[term] = jointProb;
		rawProbTotal += jointProb;
	}

	// second pass: scale the probabilities so that they sum to 1
	map<string,double>::iterator probIter = rawProbs.begin();
	while (probIter != rawProbs.end()) {
		char *term = const_cast<char*>(probIter->first.c_str());
		double normalized = probIter->second / rawProbTotal;
		addTerm(term, LROUND(corpusSize * normalized), 1);
		++probIter;
	}

	// the smoothed models were only needed during construction
	for (int doc = 0; doc < docCount; doc++) {
		delete smoothedModels[doc];
	}
} // end of buildModelLavrenko2(...)




